Introduction

We’ll be working with a mental health dataset and will be conducting exploratory data analysis, unsupervised clustering, principal component analysis, gradient boosting, and support vector machines.

Import Data

To begin, the following code will import the data and load the libraries:

library(stringr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(VIM)
library(corrplot)
library(purrr)
library(scales)
library(caret)
library(Hmisc)
library(naniar)
library(conflicted)
library(pheatmap)
library(corrplot)
library(factoextra)
library(gbm)
library(caret)
library(gridExtra)

# resolve function name conflict
conflict_prefer('filter', 'dplyr')
conflict_prefer('summarize', 'dplyr')

# import data
url <- 'https://raw.githubusercontent.com/SmilodonCub/Data622_group5_projects/main/ADHD_data.csv'
df <- read.csv(url, header=T, na.strings="")


Variable Datatypes

In order to facilitate ease of use, we’ll be renaming the columns. Additionally, we’ll convert each of the number coded fields to factors while also including the proper labels.

# convert column names to lowercase
names(df) <- lapply(names(df), tolower)

# replace periods with underscore
names(df) <- str_replace_all(names(df), '\\.', '_')

# rename last column to remove trailing underscore
names(df)[ncol(df)] <- 'psych_meds'

# raw data with renamed columns
df_raw <- df

names(df_raw)
##  [1] "initial"            "age"                "sex"               
##  [4] "race"               "adhd_q1"            "adhd_q2"           
##  [7] "adhd_q3"            "adhd_q4"            "adhd_q5"           
## [10] "adhd_q6"            "adhd_q7"            "adhd_q8"           
## [13] "adhd_q9"            "adhd_q10"           "adhd_q11"          
## [16] "adhd_q12"           "adhd_q13"           "adhd_q14"          
## [19] "adhd_q15"           "adhd_q16"           "adhd_q17"          
## [22] "adhd_q18"           "adhd_total"         "md_q1a"            
## [25] "md_q1b"             "md_q1c"             "md_q1d"            
## [28] "md_q1e"             "md_q1f"             "md_q1g"            
## [31] "md_q1h"             "md_q1i"             "md_q1j"            
## [34] "md_q1k"             "md_q1l"             "md_q1m"            
## [37] "md_q2"              "md_q3"              "md_total"          
## [40] "alcohol"            "thc"                "cocaine"           
## [43] "stimulants"         "sedative_hypnotics" "opioids"           
## [46] "court_order"        "education"          "hx_of_violence"    
## [49] "disorderly_conduct" "suicide"            "abuse"             
## [52] "non_subst_dx"       "subst_dx"           "psych_meds"
# Sex
df$sex <- factor(df$sex, levels = c(1,2), labels = c('Male','Female'))

# Race
df$race <- factor(df$race, levels = c(1,2,3,4,5,6), labels = c('White','African American','Hispanic','Asian','Native American','Other or Missing Data'))

# ADHD q1 - q18
adhd_cols <- names(df[,5:22])
df[adhd_cols] <- lapply(df[adhd_cols], factor, levels = c(0,1,2,3,4), labels = c('Never','Rarely','Sometimes','Often','Very Often')) 

# Mood Disorder q1a - q2
md_cols <- names(df[,24:37])
df[md_cols] <- lapply(df[md_cols], factor, levels = c(0,1), labels = c('No','Yes')) 

# Mood Disorder q3
df$md_q3 <- factor(df$md_q3, levels = c(0,1,2,3), labels = c('No Problem','Minor','Moderate','Serious')) 

# Substance Abuse
sa_cols <- names(df[,40:45])
df[sa_cols] <- lapply(df[sa_cols], factor, levels = c(0,1,2,3), labels = c('No Use','Use','Abuse','Dependence')) 

# Court Order
df$court_order <- factor(df$court_order, levels = c(0,1), labels = c('No','Yes'))

# Education
# think it might be okay to leave this as a number

# History of Violence, Disorderly Conduct, Suicide Attempt
hist_cols <- names(df[,48:50])
df[hist_cols] <- lapply(df[hist_cols], factor, levels = c(0,1), labels = c('No','Yes'))

# Abuse History
df$abuse <- factor(df$abuse, levels = c(0,1,2,3,4,5,6,7), 
                   labels = c('No','Physical','Sexual','Emotional','Physical & Sexual','Physical & Emotional','Sexual & Emotional','Physical, Sexual, & Emotional'))

# Non-Substance Related Drugs
df$non_subst_dx <- factor(df$non_subst_dx, levels = c(0,1,2), labels = c('None','One','More than one'))

# Substance Related Drugs
df$subst_dx <- factor(df$subst_dx, levels = c(0,1,2,3), labels = c('None','One','Two','Three or more'))

# Psychiatric Meds
df$psych_meds <- factor(df$psych_meds, levels = c(0,1,2), labels = c('None','One','More than one'))

# str(df)

Exploratory Data Analysis

The following code will quantitatively and visually explore the nature of the dataset.

We begin by describing the dataset features.

Use dplyr’s glimpse() function to take a quick look at the data structure. Followed by Hmisc’s describe() function to return some basic summary statistics about the dataframe features:

# quick look at what the data structure looks like
glimpse(df)
## Rows: 175
## Columns: 54
## $ initial            <chr> "JA", "LA", "MD", "RD", "RB", "SB", "PK", "RJ", "DJ…
## $ age                <int> 24, 48, 51, 43, 34, 39, 41, 48, 44, 27, 44, 56, 53,…
## $ sex                <fct> Male, Female, Female, Male, Male, Female, Female, M…
## $ race               <fct> White, White, White, White, White, White, White, Wh…
## $ adhd_q1            <fct> Rarely, Often, Sometimes, Often, Very Often, Someti…
## $ adhd_q2            <fct> Rarely, Often, Rarely, Often, Very Often, Often, So…
## $ adhd_q3            <fct> Very Often, Very Often, Sometimes, Sometimes, Somet…
## $ adhd_q4            <fct> Sometimes, Very Often, Rarely, Sometimes, Very Ofte…
## $ adhd_q5            <fct> Often, NA, Often, Very Often, Very Often, Often, Ve…
## $ adhd_q6            <fct> Rarely, Sometimes, Often, Often, Sometimes, Sometim…
## $ adhd_q7            <fct> Rarely, Sometimes, Often, Sometimes, Often, Often, …
## $ adhd_q8            <fct> Often, Often, Sometimes, Very Often, Very Often, Ve…
## $ adhd_q9            <fct> Sometimes, Sometimes, Never, Very Often, Very Often…
## $ adhd_q10           <fct> Very Often, Very Often, Rarely, Sometimes, Sometime…
## $ adhd_q11           <fct> Sometimes, Rarely, Sometimes, Often, Very Often, Ve…
## $ adhd_q12           <fct> Very Often, Very Often, Never, Rarely, Rarely, Some…
## $ adhd_q13           <fct> Rarely, Sometimes, Sometimes, Often, Often, Very Of…
## $ adhd_q14           <fct> Never, Very Often, Sometimes, Often, Sometimes, Ver…
## $ adhd_q15           <fct> Often, Very Often, Often, Rarely, Rarely, Often, Ve…
## $ adhd_q16           <fct> Rarely, Often, Sometimes, Sometimes, Sometimes, Ver…
## $ adhd_q17           <fct> Often, Rarely, Rarely, Rarely, Rarely, Often, Somet…
## $ adhd_q18           <fct> Very Often, Very Often, Rarely, Sometimes, Rarely, …
## $ adhd_total         <int> 40, 55, 31, 45, 48, 55, 54, 41, 56, 56, 42, 38, 31,…
## $ md_q1a             <fct> Yes, Yes, No, Yes, No, No, Yes, No, Yes, Yes, Yes, …
## $ md_q1b             <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes…
## $ md_q1c             <fct> Yes, Yes, No, No, No, No, No, No, No, No, Yes, No, …
## $ md_q1d             <fct> Yes, Yes, No, No, Yes, Yes, No, No, Yes, No, Yes, N…
## $ md_q1e             <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, Yes, Yes, Yes,…
## $ md_q1f             <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Ye…
## $ md_q1g             <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Ye…
## $ md_q1h             <fct> Yes, Yes, No, Yes, No, Yes, No, No, No, No, Yes, No…
## $ md_q1i             <fct> Yes, Yes, No, Yes, No, Yes, No, No, No, No, Yes, No…
## $ md_q1j             <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, No, No, …
## $ md_q1k             <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, Yes, No,…
## $ md_q1l             <fct> No, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes,…
## $ md_q1m             <fct> Yes, No, No, Yes, No, No, No, No, Yes, Yes, Yes, No…
## $ md_q2              <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
## $ md_q3              <fct> Serious, Serious, Moderate, Serious, Moderate, Seri…
## $ md_total           <int> 15, 14, 5, 13, 7, 14, 9, 7, 12, 11, 16, 0, 11, 10, …
## $ alcohol            <fct> Use, No Use, No Use, Use, Use, Use, Dependence, No …
## $ thc                <fct> Use, No Use, No Use, Use, Use, No Use, Dependence, …
## $ cocaine            <fct> Use, No Use, No Use, Use, No Use, No Use, Use, No U…
## $ stimulants         <fct> No Use, No Use, No Use, Use, No Use, No Use, Use, N…
## $ sedative_hypnotics <fct> No Use, No Use, No Use, No Use, No Use, No Use, Use…
## $ opioids            <fct> No Use, No Use, No Use, No Use, No Use, No Use, No …
## $ court_order        <fct> Yes, No, No, No, Yes, No, No, No, No, No, No, No, N…
## $ education          <int> 11, 14, 12, 12, 9, 11, 12, 16, 12, 9, 12, 18, 12, 1…
## $ hx_of_violence     <fct> No, No, No, No, Yes, No, Yes, Yes, Yes, No, Yes, No…
## $ disorderly_conduct <fct> Yes, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes,…
## $ suicide            <fct> Yes, Yes, No, Yes, Yes, Yes, No, No, No, No, Yes, N…
## $ abuse              <fct> "No", "Physical & Sexual", "Sexual & Emotional", "P…
## $ non_subst_dx       <fct> More than one, One, More than one, More than one, M…
## $ subst_dx           <fct> None, None, None, None, None, None, None, One, None…
## $ psych_meds         <fct> More than one, One, One, More than one, None, None,…

From this output, we can summarize each dataset feature as follows:

Column Numbers Column Labels Description
1 initial (string) subject’s initials
2 age (numeric) integer values (years)
3 sex (categorical): binary (‘male’ and ‘female’)
4 race (categorical)
5-22 adhd_q1-adhd_q18 (categorical) ordinal values
23 adhd_total (numeric): summary feature derived from adhd_q1-adhd_q18
24-38 md_q1a-md_q3 (categorical) ordinal values
39 md_total (numeric): summary feature derived from md_q1a-md_q3
40 alcohol (categorical): ordinal values
41 thc (categorical): ordinal values
42 cocaine (categorical): ordinal values
43 stimulants (categorical): ordinal values
44 sedative_hypnotics (categorical): ordinal values
45 opioids (categorical): ordinal values
46 court_order (categorical): binary (yes/no)
47 education (numeric): interger values (years)
48 hx_of_violence (categorical): binary (yes/no)
49 disorderly_conduct (categorical): binary (yes/no)
50 suicide (categorical): binary (yes/no)
51 abuse (categorical): ordinal values
52 non_subst_dx (categorical): ordinal values
53 subst_dx (categorical): ordinal values
54 psych_meds (categorical): ordinal values

The columns adhd_q1-adhd_q18 and md_q1a-md_q3 are summarized by the derivative columns adhd_total and md_total respectively. The adhd features correspond to an ADHD self-report survey whereas the md features give responses to a mood disorder self-report survey. For the initial Exploratory Data Analysis, the individual questions responses will be dropped in place of visualizations and summary statistics on the derived columns, adhd_total and md_total. A detailed analysis of the individual survey questions will be taken up in the Principal Components Analysis section.

Next, we visualize the extent of the missing values using the naniar library.

Missing Values

Use naniar’s miss_var_summary() and vis_miss() functions to summarize and visualize the missing values in the features of the dataset:

# return a summary table of the missing data in each column
miss_var_summary(df)
## # A tibble: 54 × 3
##    variable           n_miss pct_miss
##    <chr>               <int>    <dbl>
##  1 psych_meds            118    67.4 
##  2 subst_dx               23    13.1 
##  3 non_subst_dx           22    12.6 
##  4 abuse                  14     8   
##  5 suicide                13     7.43
##  6 hx_of_violence         11     6.29
##  7 disorderly_conduct     11     6.29
##  8 education               9     5.14
##  9 court_order             5     2.86
## 10 alcohol                 4     2.29
## # … with 44 more rows
# visualize the amount of missing data for each feature
vis_miss( df, cluster = TRUE )

The figure above shows a grouped view of the missing values in each feature column. Overall, 2.7% of the values are missing from the dataset. The majority of the features have no missing values. Many of the features have relatively few missing values. However, the psych_meds features is missing a considerable portion of the data.

df <- df %>%
  select( -c(psych_meds) )

Having dropped psych_meds, we can now use the gg_miss_upset() function to look for any remaining patterns of missing values in the data.

gg_miss_upset( df )

Both the vis_miss() and gg_miss_upset() visualization are informative, because they reveal some interesting patterns in the missing values for the data set. For instance, once we remove psych_meds most of the remaining rows that are not full, hold multiple missing values from a subset of the features. In other words, the remaining missing values are not randomly dispersed across the data set. Rather, they are concentrated in a subset of the features: disorderly_conduct, suicide, abuse, non_subst_dx and subst_dx. Notably, the responses that correspond to the ADHD and Mood Disorder self-reporting surveys are have no missing values; this knowledge will be helpful downstream. Further handling of missing values will be tailored to the analysis of each of the sections below.

Exploratory Data Visualizations

To gain a feel for the data, we will evaluate visualizations of select features.

Distributions of Demographic Variables

Summary Statistics of Demographic Variables

# summary of each demographic feature
demo_df <- df %>%
  select(c('age','sex','race','education'))#select( -c( 5:22, 24:38 ) )
describe( demo_df )
## demo_df 
## 
##  4  Variables      175  Observations
## --------------------------------------------------------------------------------
## age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      175        0       42    0.999    39.47     12.8     22.0     24.0 
##      .25      .50      .75      .90      .95 
##     29.5     42.0     48.0     53.0     56.0 
## 
## lowest : 18 19 20 21 22, highest: 55 56 57 61 69
## --------------------------------------------------------------------------------
## sex 
##        n  missing distinct 
##      175        0        2 
##                         
## Value        Male Female
## Frequency      99     76
## Proportion  0.566  0.434
## --------------------------------------------------------------------------------
## race 
##        n  missing distinct 
##      175        0        4 
##                                                                             
## Value                      White      African American              Hispanic
## Frequency                     72                   100                     1
## Proportion                 0.411                 0.571                 0.006
##                                 
## Value      Other or Missing Data
## Frequency                      2
## Proportion                 0.011
## --------------------------------------------------------------------------------
## education 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      166        9       14    0.929     11.9    2.265     8.25     9.00 
##      .25      .50      .75      .90      .95 
##    11.00    12.00    13.00    14.00    16.00 
## 
## lowest :  6  7  8  9 10, highest: 15 16 17 18 19
##                                                                             
## Value          6     7     8     9    10    11    12    13    14    15    16
## Frequency      2     2     5    12    12    23    67    15    14     1     7
## Proportion 0.012 0.012 0.030 0.072 0.072 0.139 0.404 0.090 0.084 0.006 0.042
##                             
## Value         17    18    19
## Frequency      2     3     1
## Proportion 0.012 0.018 0.006
## --------------------------------------------------------------------------------

Visualizing the demographic information of the mental health study

p1 <- ggplot( df, aes(x = age) ) +
  geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
  theme_classic() +
  ggtitle( 'Age Density Plot' )
num_obs <- dim(df)[1]
p2 <- df %>%
  count( sex ) %>%
  mutate( n = n/num_obs ) %>%
  ggplot( aes(x=sex, y=n)) + 
  geom_bar(stat = "identity") +
  ylim( 0, 0.6 ) +
  ylab( 'proportion' ) +
  ggtitle( 'Distribution  of Sex (binary)' ) +
  theme_classic()
p3 <- df %>%
  count( race ) %>%
  mutate( n = n/num_obs ) %>%
  ggplot( aes(x=race, y=n)) + 
  geom_bar(stat = "identity") +
  ylab( 'proportion' ) +
  ylim( 0, 0.6 ) +
  ggtitle( 'Distribution  of Race' ) +
  theme_classic() + 
  scale_x_discrete(guide = guide_axis(n.dodge = 2))
p4 <- ggplot( df, aes(x = education) ) +
  geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
  theme_classic() +
  ggtitle( 'Education Density Plot' )
num_obs <- dim(df)[1]
grid.arrange(p1, p4, p2, p3, ncol=2)

From these visualization we learn several things about the mental health study participants:

  1. Age - the age distribution is bimodal with one peak at approximate 26 and another at 46 and a range from 18 to 69 years of age.
  2. Education - education has an interesting envelope. There is a sharp peak at ~12years (high school graduates) with long kurtosis to either side
  3. Sex - sex is given as a binary feature and is not equally balanced with 57% Male and 43% Female
  4. Race - race takes one of four values: ‘White’ (41%), ‘African American’ (57%), ‘Hispanic’(.6%) and ‘Other or Missing Data’(1%).

ADHD and Mood Disorder Survey Scores

Summary Statistics for self-report surveys

# summary of each field
initial_eda_df <- df %>%
  select( c('adhd_total', 'md_total' ) )
describe( initial_eda_df )
## initial_eda_df 
## 
##  2  Variables      175  Observations
## --------------------------------------------------------------------------------
## adhd_total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      175        0       62    0.999    34.32    19.16      7.0     12.0 
##      .25      .50      .75      .90      .95 
##     21.0     33.0     47.5     55.0     62.3 
## 
## lowest :  0  1  3  5  6, highest: 65 67 69 71 72
## --------------------------------------------------------------------------------
## md_total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      175        0       18    0.995    10.02    5.469      0.7      3.0 
##      .25      .50      .75      .90      .95 
##      6.5     11.0     14.0     16.0     17.0 
## 
## lowest :  0  1  2  3  4, highest: 13 14 15 16 17
##                                                                             
## Value          0     1     2     3     4     5     6     7     8     9    10
## Frequency      9     3     5     6     4     7    10     6     8    12    13
## Proportion 0.051 0.017 0.029 0.034 0.023 0.040 0.057 0.034 0.046 0.069 0.074
##                                                     
## Value         11    12    13    14    15    16    17
## Frequency     18    12    13    12    14    12    11
## Proportion 0.103 0.069 0.074 0.069 0.080 0.069 0.063
## --------------------------------------------------------------------------------

Visualizing the adhd_total and md_total features. These two features give a summary score for the questions on an ADHD Self-Reporting Survey and a separate Mood Disorder Questionnaire.

p1 <- ggplot( df, aes(x = adhd_total) ) +
  geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
  theme_classic() +
  ggtitle( 'ADHD Self-Reporting Survey Scores' )
p2 <- ggplot( df, aes(x = md_total) ) +
  geom_density( fill="#9D85FC", color="#e9ecef", alpha=0.8 ) +
  theme_classic() +
  ggtitle( 'Mood Disorder Self-Reporting Survey Scores' )
grid.arrange( p1, p2, ncol=2 )

adhd_total has a roughly normal distribution with a mean centered at 34.3 and a range of 0 to 72. md_total has an entirely different range from 0 to 17; it’s distribution is more left-skewed than ADHD.

Next we visualize adhd_total ~ md_total to evaluate the relationship between the two total scores

ggplot(df, aes(x=md_total, y=adhd_total)) + 
  geom_point()+
  geom_smooth(method=lm) +
  theme_classic() +
  ggtitle('ADHD total ~ Mood Disorder total')

We can evaluate the fit of a linear model to the data:

fit = lm( adhd_total ~ md_total, df )
summary( fit )
## 
## Call:
## lm(formula = adhd_total ~ md_total, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.718  -8.672  -1.941   9.163  38.713 
## 
## Coefficients:
##             Estimate Std. Error t value          Pr(>|t|)    
## (Intercept)  16.5104     2.5173   6.559 0.000000000603804 ***
## md_total      1.7769     0.2265   7.844 0.000000000000432 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.38 on 173 degrees of freedom
## Multiple R-squared:  0.2623, Adjusted R-squared:  0.2581 
## F-statistic: 61.53 on 1 and 173 DF,  p-value: 0.0000000000004319
par(mfrow = c(2, 2))
p1 <- plot(fit, which=1)
p2 <- plot(fit, which=2)
p3 <- plot(fit, which=3)
p4 <- plot(fit, which=4)

par(mfrow = c(1, 1))

The fit is statistically significant and the diagnostics suggest that the assumptions for linearity are reasonably met. Therefore, we can say with confidence that for a given point increase in Mood Disorder Score, the average ADHD score increases 1.7769 points.

For an EDA of the individual question responses for both the Mood Disorder and ADHD self-reporting surveys, please see the PCA section.

Substance Misuse and other Features

Summary Statistics for Substance Misuse Features

# summary of each field
sub_df <- df %>%
  select( c( 'alcohol', 'thc', 'cocaine', 'stimulants', 'sedative_hypnotics', 'opioids', 'subst_dx' ) )
describe( sub_df )
## sub_df 
## 
##  7  Variables      175  Observations
## --------------------------------------------------------------------------------
## alcohol 
##        n  missing distinct 
##      171        4        4 
##                                                       
## Value          No Use        Use      Abuse Dependence
## Frequency          80         18          7         66
## Proportion      0.468      0.105      0.041      0.386
## --------------------------------------------------------------------------------
## thc 
##        n  missing distinct 
##      171        4        4 
##                                                       
## Value          No Use        Use      Abuse Dependence
## Frequency         116         12          3         40
## Proportion      0.678      0.070      0.018      0.234
## --------------------------------------------------------------------------------
## cocaine 
##        n  missing distinct 
##      171        4        4 
##                                                       
## Value          No Use        Use      Abuse Dependence
## Frequency         101          9          5         56
## Proportion      0.591      0.053      0.029      0.327
## --------------------------------------------------------------------------------
## stimulants 
##        n  missing distinct 
##      171        4        3 
##                                            
## Value          No Use        Use Dependence
## Frequency         160          6          5
## Proportion      0.936      0.035      0.029
## --------------------------------------------------------------------------------
## sedative_hypnotics 
##        n  missing distinct 
##      171        4        4 
##                                                       
## Value          No Use        Use      Abuse Dependence
## Frequency         161          4          1          5
## Proportion      0.942      0.023      0.006      0.029
## --------------------------------------------------------------------------------
## opioids 
##        n  missing distinct 
##      171        4        3 
##                                            
## Value          No Use        Use Dependence
## Frequency         146          4         21
## Proportion      0.854      0.023      0.123
## --------------------------------------------------------------------------------
## subst_dx 
##        n  missing distinct 
##      152       23        4 
##                                                                   
## Value               None           One           Two Three or more
## Frequency             42            61            35            14
## Proportion         0.276         0.401         0.230         0.092
## --------------------------------------------------------------------------------

The individual substance features (e.g. opioids) describe the severity with which a participants uses or does not use a particular substance. On the other hand, subst_dx counts how many substances a participant has a reported using irrespective of severity of use. The following figure will attempt to find patterns between the individual substances and the summary feature.

none_count <- sum(sub_df$subst_dx == 'None')
one_count <- sum(sub_df$subst_dx == 'One')
two_count <- sum(sub_df$subst_dx == 'Two')
more_count <- sum(sub_df$subst_dx == 'Three or more')
total_count <- dim(sub_df)[1]
  
plot_df <- sub_df %>%
  drop_na() %>%
  #mutate_if(is.factor, as.numeric) %>%
  gather(var, value, -subst_dx) %>%
  group_by(var, value, subst_dx) %>%
  dplyr::summarize(count = n(),
            .groups = 'drop') %>%
  dplyr::mutate(prop = count / case_when(subst_dx == 'None' ~ none_count, 
                                         subst_dx == 'One' ~ one_count,
                                         subst_dx == 'Two' ~ two_count,
                                         subst_dx == 'Three or more' ~ more_count))
plot_df$value <- factor(plot_df$value,levels = c("No Use", "Use", "Dependence", "Abuse"))

ggplot(plot_df, aes(x = value, y = count, fill = subst_dx)) +
  geom_col(position="stack") +#, position = 'dodge') +
  facet_wrap(~var, scales = 'free') +
  theme_bw() +
  labs(y = 'count',
       x = element_blank(),
       title = 'Distributions For Substances') #+

  #scale_y_continuous(labels = percent_format(accuracy = 1))

From the figure above, we see that for each individual substance, the majority of participants report ‘No use’. However, the instances of ‘Dependence’ are elevated for several substances notably alcohol, cocaine and THC. For these substances it is the case that there is a substantial proportion of participants that also reported a substance abuse diagnosis (subst_dx) of three or more substances. There is another interesting result in this figure: there are many instances where a level of substance use was self-reported for a participant that also reported ‘None’ for substance abuse diagnosis. It is interesting that a participant can simultaneously report to, for example, have a dependence on alcohol yet report ‘None’ for a substance abuse diagnosis. Perhaps more documentation on the dataset can shed light on this.

To re-represent this data, we will generate a new summary feature that sums the reported sevarity of use for each substance included in the self report:

sub_df <- sub_df %>%
  mutate_if(is.factor, as.numeric) %>%
  mutate( subst_sum = alcohol + thc + cocaine + stimulants + sedative_hypnotics + opioids )

We can derive a similar summary variable for the non-substance abuse behaviors reported (e.g. disorderly conduct, court order etc.). Hoever, we exclude abuse because it has a difference and non-binary reporting criteria.

abuse_df <- df %>%
  select( c( 'suicide', 'disorderly_conduct', 'hx_of_violence', 'court_order' ) ) %>%
  mutate_if(is.factor, as.numeric) %>%
  mutate( abuse_sum = suicide + disorderly_conduct + hx_of_violence + court_order )

Visualize the distribution of the new summary variables

p1 <- ggplot( sub_df, aes(x = subst_sum) ) +
  geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
  theme_classic() +
  ggtitle( 'ADHD Self-Reporting Survey Scores' )
p2 <- ggplot( abuse_df, aes(x = abuse_sum) ) +
  geom_density( fill="#9D85FC", color="#e9ecef", alpha=0.8 ) +
  theme_classic() +
  ggtitle( 'Mood Disorder Self-Reporting Survey Scores' )
grid.arrange( p1, p2, ncol=2 )

For one last exploratory visualization of the data, we will look at correlations between the self-reporting survey summary features (adhd_total & md_total) and the new summary feature generated here. Obviously, a positive correlation is expected between the two survey totals, because we demonstrated the linear regression earlier.

df_feats <- df %>%
  select( c('adhd_total','md_total') ) %>%
  mutate( subst_sum = as.integer( sub_df$subst_sum ),
          abuse_sum = as.integer( abuse_df$abuse_sum ) ) %>%
  drop_na()

corrplot(cor(df_feats), method="number")

From the correlation plot above, we do not see a strong relationship between the self-reported survey total scores and the new features we generated to summarize substance abuse and non-substance abuses. However, in upcoming sections we will use machine learning techniques to investigate this relationship further.

Clustering Methods

Finding K

In order to find patients of clusters, we first need to determine the number of clusters. Below, we’ll use two common methods, evaluating the sum of squares and silhouette width, in order to decide how many clusters we’ll use.

# exclude names
df3 <- df_raw %>%
  select(-initial)

# impute null
preproc <- preProcess(df3, 'bagImpute')
df4 <- predict(preproc, df3)

Total Within Sum of Square

fviz_nbclust(df4, kmeans, method = "wss")

Based on the sum of squares for each value of K, we can use 2 clusters as the ‘elbow’ point.

Average Siluotte Width

fviz_nbclust(df4, kmeans, method = "silhouette")

Based on the average silhouette width of clusters, we can proceed with K = 2.

In this case, both methods have a consensus on two clusters.

Calculating Clusters

Once the clusters are calculated, let’s check the size of each cluster to make sure we’re looking at different groups.

# 25 random starts
cluster_k2 <- kmeans(df4, 2, nstart = 25)

# add cluster number to dataframe
df4$cluster_k2 <- cluster_k2$cluster

# check cluster sizes
table(df4$cluster_k2)
## 
##  1  2 
## 88 87

Using K = 2, we created an almost even split of observations. Cluster 1 has 88 observations and cluster 2 has 87.

ADHD Total vs Suicide Plot

# adhd vs suicide
fviz_cluster(cluster_k2, data = df4, c('adhd_total', 'suicide'))

The clusters created show a clear delineation for respondents based on adhd_total. This suggests that the clusters are separated based on levels of ADHD.

Mood Disorder Total vs Suicide Plot

# mood disorder vs suicide
fviz_cluster(cluster_k2, data = df4, c('md_total', 'suicide'))

The results of the k2 cluster show considerable overlap based on md_total. This means that the clusters aren’t separated by mood disorder levels.

Relative Cluster Differences

Let’s examine relative distributions for each cluster and variable.

# function to plot relative variable distributions for clusters
cluster_plot <- function(start_col, end_col, plot = 'density', legend_position = 'bottom') {

  temp_df <- df4[,start_col:end_col] %>%
    bind_cols(cluster = factor(df4$cluster_k2)) %>%
    gather(var, val, -cluster)

  if (plot == 'density') {
    p <- ggplot(temp_df, aes(x = val, fill = cluster)) +
    geom_density(alpha = 0.3)
  } else if (plot == 'histogram') {
    p <- ggplot(temp_df, aes(x = val, fill = cluster)) +
    geom_histogram(alpha = 0.3)
  }
  
  p +
    facet_wrap(~var, scales = 'free') +
    theme_bw() +
    labs(fill = 'Cluster',
         x = element_blank(),
         y = element_blank()
         ) +
    theme(axis.ticks.y = element_blank(),
          axis.text.y = element_blank(),
          axis.text.x = element_text(size = 8),
          legend.position = legend_position)

}

Demographics

cluster_plot(1,3)

  • Cluster 1 show a bimodal distribution of age, with modes as 25 and 45 years old. Cluster 2 has fewer younger people with a higher single mode.
  • Cluster 2 is more likely to be African American.
  • Cluster 1 is more likely to be female and Cluster 2 is more likely to be male.

ADHD

cluster_plot(4,22)

With clear differences for each question as well as the total, cluster 1 reported higher adhd values relative to cluster 2.

Mood Disorders

cluster_plot(23,38)

Some questions have similiar distributions, but generally overall as well as altogether, cluster 1 is more likely to report higher in mood disorders.

Substance Abuse

cluster_plot(39,44)

  • Alcohol: Both clusters show similar usage patterns for alcohol, with cluster 1 having a slightly higher chance of using, abusing, and being dependent. Cluster 2 has a higher chance of not drinking.
  • Cocaine: Cluster 1 is twice as likely as cluster 2 to not use cocaine and cluster 2 has a relatively slightly higher chance of being dependent.
  • Opioids: Cluster 2 has a higher chance of not using.
  • Sedative Hypnotics: Cluster 2 has a slighly higher chance of not using.
  • Stimulants: Cluster 2 mostly does not use.
  • THC: Cluster 1 more likely to both not use and also to be dependent.

Substance Abuse Levels

cluster_plot(51,53)

Cluster 2 has a higher chance of not taking non-substance abuse related drugs and is more likely to take one substance related drug.

Social

cluster_plot(45,50)

Cluster 2 is less likely to experience abuse and suicide, and more likely to have a history of violence and be charged with disorderly conduct

Principal Component Analysis

In this section we will explore the ADHD data set further using Principal Component Analysis (PCA). PCA is a dimensionality reduction method that can be very useful for detecting patterns in complex, feature-rich data. At it’s core, PCA is a fairly simple linear algebra manipulation, an eigendecomposition. Basically, PCA maps the data onto a new set of components that consist of orthogonal axes (eigenvectors) and magnitudes (eigenvalues). Each successive component is added such that it describes as much remaining variance in the data as possible. From the resulting components we can select a subset that describe most of the variance in a new, lower-dimensional feature space which is more convenient for observing relationships in the data that were previously masked by the data’s complexity.

To approach this analysis, we derive 2 sets of features from the original data. Data features that correspond to 1) self-reported answers for an ADHD survey, 2) self-reported answers for a Mood Disorder survey.

PCA: ADHD Self-Report Survey

The Adult ADHD Self-Reporting Scale Symptoms Checklist is a survey consisting of 18 questions that all range on an ordinal categorical scale: ‘Never’, ‘Rarely’, ‘Sometimes’, ‘Often’, ‘Very Often’

Preprocessing

To prepare the ADHD survey responses we performed the following:

  • selected the ADHD survery question responses and exclude the feature adhd_total which gives a summary score for the survey
  • PCA works best with numeric data, so we will transform the ordinal categorical features to numeric values
  • evaluate the presence of missing values in the data
# preparing the ADHD self-report questionnaire data
adhd_df <- df %>%
  select( starts_with("adhd_") ) %>%
  select( -'adhd_total' )
# recode all questions from factor to numeric values
adhd_df <- adhd_df %>%
  mutate_at( 
    vars(matches("adhd_q") ), 
         funs( recode_factor( ., 'Never'=0,'Rarely'=1,'Sometimes'=2,'Often'=3,'Very Often'=4 ) ) )  %>%
  mutate_if( is.factor, as.numeric )
paste( 'initial number of rows:', dim( adhd_df )[1] )
## [1] "initial number of rows: 175"

Summary of missing responses in the ADHD Self-Reporting Survey:

# return a summary table of the missing data in each column
miss_var_summary( adhd_df )
## # A tibble: 18 × 3
##    variable n_miss pct_miss
##    <chr>     <int>    <dbl>
##  1 adhd_q5       1    0.571
##  2 adhd_q1       0    0    
##  3 adhd_q2       0    0    
##  4 adhd_q3       0    0    
##  5 adhd_q4       0    0    
##  6 adhd_q6       0    0    
##  7 adhd_q7       0    0    
##  8 adhd_q8       0    0    
##  9 adhd_q9       0    0    
## 10 adhd_q10      0    0    
## 11 adhd_q11      0    0    
## 12 adhd_q12      0    0    
## 13 adhd_q13      0    0    
## 14 adhd_q14      0    0    
## 15 adhd_q15      0    0    
## 16 adhd_q16      0    0    
## 17 adhd_q17      0    0    
## 18 adhd_q18      0    0

We see that there is only 1 missing value, so it seems reasonable to just drop it

# remove missing
adhd_dna <- adhd_df %>%
  drop_na()
paste( 'after drop_na number of rows:', dim( adhd_dna )[1] )
## [1] "after drop_na number of rows: 174"

PCA from scratch

We can perform PCA easily from scratch and use the intermediate steps to learn more about the relationship of the individual question features to the ADHD total score:

  • First: we need to scale the data.
    • This we de-mean the data and normalize the range of all features. Perhaps, redundant because all questions for this survey are on the same scale.
    • use the scale() function
  • Second: we find the covariance matrix
    • use the cov() function
# scale the data to facilitate PCA
adhd_df_scaled <- scale( adhd_dna )
# create a covariance matrix
adhd_df_cov <- cov( adhd_df_scaled )

Visualizing the covariance matrix

pheatmap(adhd_df_cov, display_numbers = T, color = colorRampPalette(c('white','red'))(100), cluster_rows = F, cluster_cols = F, fontsize_number = 15)

The values off the diagonal of the matrix describe the degree of covariance between the two features. We see that some pairs of questions are more closely related than others. For instance, questions #5 and #13 have the highest covariance:

  • adhd_q5 - “How often do you fidget or squirm with your hands or feet when you have to sit down for a long time?”
  • adhd_q13 - “How often do you feel restless or fidgety”

Perhaps it is no surprise that subjects who report the frequency they fidget when sitting down for a long time report a similar incidence of general fidgeting/squirming.

Another closely covarying pair of questions are #16 and #18

  • adhd_q16 - “When you’re in a conversation, how often do you find yourself finishing the sentences of the people you are talking to, before they can finish them themselves?”
  • adhd_q18 - “How often do you interrupt others when they are busy?”

Again, these are two very similar questions.
It is equally interesting to evaluate the questions with the lowest covariance, questions #3 and #16(see above).

  • adhd_q3 - “How often do you have problems remembering appointments or obligations?”

Indeed, conversation skills and the ability to remember appointments are two (subjectively) different tasks, so it is reasonable to not see a strong relation between the two.

Moving on, the covariance matrix can now be used for an eigendecomposition to find the principal components that describe the most variance in the data

# eigendecomposition of the covariance matrix
eig <- eigen( adhd_df_cov )
explained_variance <- eig$values
components <- eig$vectors
explained_variance
##  [1] 9.3075284 1.3703108 1.0189437 0.8140854 0.7215297 0.5939121 0.5529787
##  [8] 0.5013390 0.4471306 0.4144583 0.3953239 0.3693263 0.3523467 0.2795267
## [15] 0.2734344 0.2160613 0.2060649 0.1656991

Now, by storing the eigen values in a dataframe, we can explore the amount of variance explained by the PCA components. Furthermore, we find the ratio of explained variance by normalizing the cumulative variance of the components

compnum <- 1:length( explained_variance )
exv_cumsum <- cumsum( explained_variance )/length( explained_variance )
pca_res <- data.frame( compnum, explained_variance, exv_cumsum )
pca_res
##    compnum explained_variance exv_cumsum
## 1        1          9.3075284  0.5170849
## 2        2          1.3703108  0.5932133
## 3        3          1.0189437  0.6498213
## 4        4          0.8140854  0.6950482
## 5        5          0.7215297  0.7351332
## 6        6          0.5939121  0.7681283
## 7        7          0.5529787  0.7988494
## 8        8          0.5013390  0.8267015
## 9        9          0.4471306  0.8515421
## 10      10          0.4144583  0.8745676
## 11      11          0.3953239  0.8965300
## 12      12          0.3693263  0.9170482
## 13      13          0.3523467  0.9366230
## 14      14          0.2795267  0.9521522
## 15      15          0.2734344  0.9673430
## 16      16          0.2160613  0.9793464
## 17      17          0.2060649  0.9907945
## 18      18          0.1656991  1.0000000

Visualizing the cumulative explained variance described by the principal components with a Skree Plot:

ggplot( pca_res, aes( x = compnum, y = exv_cumsum ) ) +
  geom_line() +
  ylim( c(0,1.1) ) +
  scale_x_continuous(breaks = seq(0, length( explained_variance ), by = 1)) +
  geom_hline( yintercept = 0.95, linetype = 'dotted', col = 'red') +
  annotate("text", x = 2, y = 0.98, 
           label = expression( "95%" ~ sigma), vjust = -0.5) +
  theme_minimal() +
  xlab( 'Principal Component Number' ) +
  ylab( 'Cummulative Explained Variance' ) +
  ggtitle( 'Scree Plot: Explained Variance of Principal Components' )

The figure above plots the cumulative explained variance of the ADHD survey questions and total score as a result of the eigendecomposition. We see that the first principal component accounts for 51.7% of the total variance. Adding successive components gradually increases the cumulative explained variance. It takes as many as 14 components to describe 95% of the data’s variance. Therefore, dimensionality reduction by way of PCA only moderately decreases the components necessary to adequately explain the data (with a threshold of 95%).

PCA using prcomp()

We can determine the principal components of the ADHD data more quickly using the prcomp() function from the stats library.

# use the prcomp function
adhd_dna.pca <- prcomp( adhd_dna, center = TRUE, scale = TRUE )
# print the pca summary
summary( adhd_dna.pca )
## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5    PC6     PC7
## Standard deviation     3.0508 1.17060 1.00943 0.90227 0.84943 0.7707 0.74363
## Proportion of Variance 0.5171 0.07613 0.05661 0.04523 0.04008 0.0330 0.03072
## Cumulative Proportion  0.5171 0.59321 0.64982 0.69505 0.73513 0.7681 0.79885
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.70805 0.66868 0.64378 0.62875 0.60772 0.59359 0.52870
## Proportion of Variance 0.02785 0.02484 0.02303 0.02196 0.02052 0.01957 0.01553
## Cumulative Proportion  0.82670 0.85154 0.87457 0.89653 0.91705 0.93662 0.95215
##                           PC15   PC16    PC17    PC18
## Standard deviation     0.52291 0.4648 0.45394 0.40706
## Proportion of Variance 0.01519 0.0120 0.01145 0.00921
## Cumulative Proportion  0.96734 0.9794 0.99079 1.00000

It is interesting to compare the Cumulative Proportion from the prcomp() output with the feature pca_res$exv_cumsum; the results are the same for both the built-in PCA function as well as are PCA from scratch (a relief!).

The PCA object that results from prcomp() can be used to visualize the components using the ggbiplot library:

library( ggbiplot )
ggbiplot( adhd_dna.pca )

The biplot shown above renders the data in the feature space of principal components PC1 and PC2; these are the components that explain most of the variance in the data. The scatterplot points show the projection of the data onto PC1 & PC2. The red arrows represent the original feature space that the data was represented in. We can see that the original features have the largest projection onto the PC1 axis. Relatively few original features have a sizeable projection onto PC2 (e.g. adhd_q16 & adhd_q17). Additionally, these vectors all have a positive component along PC1 showing that the result for each question has a positive relation to PC1. The size of the angle between a given pair of vectors is inversely related to the correlation of the pair. For example, we see that adhd_q3 and adhd_q16 have the largest angle between them; this corresponds to our earlier observation looking at the covariance matrix.

Mood Disorders self-report

The Mood Disorder Questionnaire is a self-reporting survey that can be used to identify subjects that may be bipolar.

We can perform a similar PCA analysis for the Mood Disorder survey results. Taking similar preprocessing steps: 1) select mood disorder question features, evaluate missing values…

# prepare the mood disorder data and recode all questions from factor to numeric values
md_df <- df %>%
  select( starts_with("md_") ) %>%
  select( -'md_total' ) %>%
  mutate_if( is.factor, as.numeric )

# return a summary table of the missing data in each column
miss_var_summary( md_df )
## # A tibble: 15 × 3
##    variable n_miss pct_miss
##    <chr>     <int>    <dbl>
##  1 md_q1a        0        0
##  2 md_q1b        0        0
##  3 md_q1c        0        0
##  4 md_q1d        0        0
##  5 md_q1e        0        0
##  6 md_q1f        0        0
##  7 md_q1g        0        0
##  8 md_q1h        0        0
##  9 md_q1i        0        0
## 10 md_q1j        0        0
## 11 md_q1k        0        0
## 12 md_q1l        0        0
## 13 md_q1m        0        0
## 14 md_q2         0        0
## 15 md_q3         0        0

Wonderful, there are no missing values, so we can proceed with PCA using prcomp()

# use the prcomp function
md_df.pca <- prcomp( md_df, center = TRUE, scale = TRUE )
# print the pca summary
summary( md_df.pca )
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.3857 1.3474 0.9721 0.94891 0.91563 0.82515 0.79246
## Proportion of Variance 0.3794 0.1210 0.0630 0.06003 0.05589 0.04539 0.04187
## Cumulative Proportion  0.3794 0.5004 0.5635 0.62348 0.67937 0.72476 0.76663
##                            PC8     PC9    PC10    PC11    PC12   PC13    PC14
## Standard deviation     0.77044 0.74286 0.72814 0.70302 0.65162 0.5835 0.55685
## Proportion of Variance 0.03957 0.03679 0.03535 0.03295 0.02831 0.0227 0.02067
## Cumulative Proportion  0.80620 0.84299 0.87834 0.91128 0.93959 0.9623 0.98296
##                           PC15
## Standard deviation     0.50553
## Proportion of Variance 0.01704
## Cumulative Proportion  1.00000

Now to visualize the PCA biplot:

ggbiplot( md_df.pca ) + xlim( -2,2 )

A similar pattern emerges for the Mood Disorder survey responses. We see that all the projections again have a major component along PC1. Again, we can infer relationships between features from the biplot: md_q1c and md_q3 have the widest angle separating them and are therefore the least correlated features. where:

  • md_q1c - Has there been a period of time when (…) you felt much more self-confident than usual
    • encoding - No:0, Yes:1
  • md_q3 - How much of a problem did any of these cause you - being unable to work; having family, money or legal troubles; getting into arguments or fights
    • encoding - No Problem:0, Minor Problem:1, Moderate Problem:2, Serious Problem:3

Responses to md_q1c with a ‘Yes’ reflect a positive affect with a larger numeric value. Conversely, larger numeric values for md_q3 are associated with a negative aspect. In light of the opposite score polarities, it is not surprising to see a lack of correlation between these two features in the PC1/PC2 space.

On major difference between the Mood Disorder and ADHD biplot is that all the feature vectors point in a negative direction in relation to PC1. This suggests inverse relationship between the features variables and PC1. You may recall from the previous section that the responses to the ADHD survey were all positive in relation to PC1.

Gradient Boosting

Gradient Boosting builds a prediction model from an ensemble of smaller ‘weak learner’ models. In this section we will implement the gradient boosting approach to fit a binary classification model to features from the ADHD data set to predict whether a patient attempts suicide.

Data Preparation

Before fitting a Gradient Boost Model, we will collect a new subset of features for this approach. Instead of including all features for individual questions on either of the summaries, we will only be using the total scores (adhd_total and md_total) because the total scores are composite features build from the scores on the questions. Additionally, the categorical responses

ordinal_cats <- c( 'alcohol', 'thc', 'cocaine', 'stimulants', 'sedative_hypnotics', 'opioids', 'suicide',
                   'court_order', 'hx_of_violence', 'disorderly_conduct', 'non_subst_dx', 'subst_dx')
gboost_df <- df %>%
  select( -starts_with("md_q") ) %>%
  select( -starts_with("adhd_q")) %>%
  select( -c('initial') ) %>%
  dplyr::mutate( across( all_of( ordinal_cats ), as.numeric ) )
paste( 'number of rows:', dim( gboost_df )[1] )
## [1] "number of rows: 175"
glimpse( gboost_df )
## Rows: 175
## Columns: 19
## $ age                <int> 24, 48, 51, 43, 34, 39, 41, 48, 44, 27, 44, 56, 53,…
## $ sex                <fct> Male, Female, Female, Male, Male, Female, Female, M…
## $ race               <fct> White, White, White, White, White, White, White, Wh…
## $ adhd_total         <int> 40, 55, 31, 45, 48, 55, 54, 41, 56, 56, 42, 38, 31,…
## $ md_total           <int> 15, 14, 5, 13, 7, 14, 9, 7, 12, 11, 16, 0, 11, 10, …
## $ alcohol            <dbl> 2, 1, 1, 2, 2, 2, 4, 1, 2, 1, 4, 1, 2, 1, 4, 1, 1, …
## $ thc                <dbl> 2, 1, 1, 2, 2, 1, 4, 1, 1, 4, 2, 1, 2, 1, 2, 1, 1, …
## $ cocaine            <dbl> 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
## $ stimulants         <dbl> 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
## $ sedative_hypnotics <dbl> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
## $ opioids            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 4, 1, 1, 1, 1, 4, 1, …
## $ court_order        <dbl> 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, …
## $ education          <int> 11, 14, 12, 12, 9, 11, 12, 16, 12, 9, 12, 18, 12, 1…
## $ hx_of_violence     <dbl> 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, …
## $ disorderly_conduct <dbl> 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, …
## $ suicide            <dbl> 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
## $ abuse              <fct> "No", "Physical & Sexual", "Sexual & Emotional", "P…
## $ non_subst_dx       <dbl> 3, 2, 3, 3, 3, 1, 2, 3, 2, 2, 2, 3, 1, 2, 1, 3, 3, …
## $ subst_dx           <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 3, 1, 1, 1, 2, 2, 1, …

Evaluate the presence of missing data

miss_var_summary( gboost_df )
## # A tibble: 19 × 3
##    variable           n_miss pct_miss
##    <chr>               <int>    <dbl>
##  1 subst_dx               23    13.1 
##  2 non_subst_dx           22    12.6 
##  3 abuse                  14     8   
##  4 suicide                13     7.43
##  5 hx_of_violence         11     6.29
##  6 disorderly_conduct     11     6.29
##  7 education               9     5.14
##  8 court_order             5     2.86
##  9 alcohol                 4     2.29
## 10 thc                     4     2.29
## 11 cocaine                 4     2.29
## 12 stimulants              4     2.29
## 13 sedative_hypnotics      4     2.29
## 14 opioids                 4     2.29
## 15 age                     0     0   
## 16 sex                     0     0   
## 17 race                    0     0   
## 18 adhd_total              0     0   
## 19 md_total                0     0
gg_miss_upset( gboost_df )

We see that the feature psych_meds is missing many values, so we will drop this feature from further consideration. Additionally, we see that there are approximately two dozen rows that are missing values for multiple features. Therefore, instead of imputing the missing, we will simply drop these rows.

gboost_df <- gboost_df %>%
  #select( -psych_meds ) %>%
  drop_na()
paste( 'number of rows:', dim( gboost_df )[1] )
## [1] "number of rows: 142"
miss_var_summary( gboost_df )
## # A tibble: 19 × 3
##    variable           n_miss pct_miss
##    <chr>               <int>    <dbl>
##  1 age                     0        0
##  2 sex                     0        0
##  3 race                    0        0
##  4 adhd_total              0        0
##  5 md_total                0        0
##  6 alcohol                 0        0
##  7 thc                     0        0
##  8 cocaine                 0        0
##  9 stimulants              0        0
## 10 sedative_hypnotics      0        0
## 11 opioids                 0        0
## 12 court_order             0        0
## 13 education               0        0
## 14 hx_of_violence          0        0
## 15 disorderly_conduct      0        0
## 16 suicide                 0        0
## 17 abuse                   0        0
## 18 non_subst_dx            0        0
## 19 subst_dx                0        0

We will begin by splitting the data into train and test sets

df2 <- df 

# train test split
set.seed(101)
trainIndex <- createDataPartition(df2$suicide,
                                  p = 0.75,
                                  list = F)

train <- df2[trainIndex,]
test <- df2[-trainIndex,]

# cross validation train control
ctrl <- trainControl(method = 'cv', number = 10)
indexes = createDataPartition(gboost_df$suicide, p = .90, list = F)
train = gboost_df[indexes, ]
test = gboost_df[-indexes, ]
# Train model with preprocessing & repeated cv
model_gbm <- caret::train(suicide ~ .,
                          data = train,
                          method = "gbm",
                          preProcess = c("scale", "center"),
                          trControl = trainControl(method = "repeatedcv", 
                                                  number = 10, 
                                                  repeats = 3, 
                                                  verboseIter = FALSE),
                          verbose = 0)
print(model_gbm)
## Stochastic Gradient Boosting 
## 
## 128 samples
##  18 predictor
## 
## Pre-processing: scaled (28), centered (28) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 115, 115, 116, 115, 115, 115, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE       Rsquared    MAE      
##   1                   50      0.4618525  0.11416303  0.4067475
##   1                  100      0.4716013  0.10871921  0.4061297
##   1                  150      0.4763428  0.11119615  0.4071995
##   2                   50      0.4759660  0.09982923  0.4143810
##   2                  100      0.4832988  0.10076528  0.4164349
##   2                  150      0.4803671  0.11094950  0.4084625
##   3                   50      0.4751328  0.09992148  0.4062867
##   3                  100      0.4891577  0.09489353  0.4119593
##   3                  150      0.4895725  0.10903967  0.4100652
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 50, interaction.depth =
##  1, shrinkage = 0.1 and n.minobsinnode = 10.
#caret::confusionMatrix(
#  data = predict(model_gbm, test),
#  reference = test$suicide
#  )

Support Vector Machine